/**********************************************************************
gc_1_create_firm_panel.do

- clean firm cross-section

- create longitudinal firm dataset

- identify startups

**********************************************************************/
**********
* SET UP *
**********
clear all
set matsize 2000
set more 1

* location for dofiles *
cd "T:\_Projet_4915\dofiles"
global dir "`c(pwd)'"
cd $dir

*********
* GATES *
*********
* Specify which data you want to work with (synthetic = syn, real = rl) *
local ext = "rl"

* Specify which cross-sections you want to process *
local start_year = 2001
local end_year = 2012

* STEP 1: clean firm cross-section
local gate1 = 1

* STEP 2: create longitudinal firm dataset *
local gate2 = 1

* STEP 3: identify startups *
local gate3 = 1

* STEP 4: delete all intermediate datasets *
local gate4 = 1

* start log file *
quietly capture log close
quietly log using gc_1_create_firm_panel_`ext', text replace

* specify file locations *
global project_folder "\_Projet_4915"
global data_folder "\_Projet_4915\DATA"
global output_folder "\_Projet_4915\ResultsFolder"
global temp "temp"

local datadir T:\${data_folder}\

*******************************************
* MAKE A GLOBAL VARIABLE FOR TODAY'S DATE *
*******************************************
local tyr = substr("$S_DATE",8,4)
local tmo = substr("$S_DATE",4,3)
local tmd = trim(substr("$S_DATE",1,2))

* make day of month two digits *
local wl = length("`tmd'")
if `wl'==1 {
local tmd2 ="0"+"`tmd'"
}
if `wl'!=1 {
local tmd2 ="`tmd'"
}

* get numeric month, make it two digits *
local di="`tmd2'"+"`tmo'"+"`tyr'"
local edate = date("`di'", "DMY")
local mono = month(`edate')
local ml = length("`mono'")
if `ml'==1 {
local mono2 ="0"+"`mono'"
}
if `ml'!=1 {
local mono2 ="`mono'"
}

* put final date together *
global date = "`tyr'"+"`mono2'"+"`tmd2'"

****************************************
* DEFINE LITTLE PROGRAMS TO PRINT TIME *
****************************************
program define starttime
	display "Started processing at $S_TIME on $S_DATE"
end

program define endtime
	display "Finished processing at $S_TIME on $S_DATE"
end

************************
* START OF THE PROGRAM *
************************

************************************
* STEP 1: clean firm cross-section *
************************************
disp "***** Started processing STEP 1 *****"
starttime
if 1 == `gate1' ///
{
	disp "***** STEP 1: clean firm cross-section *****"
	
	forvalues y = `start_year'(1)`end_year' ///
	{
		use "`datadir'Enterprise_`y'_`ext'.dta", clear
		
		sort eid
		gen alive = 1
		
		* Specify which variables I want to keep and give them a year suffix *
		global keep_list ""
		foreach var in ///
		"alive" "naics4" "incorporationdate" "birthdate" ///
		"t4_ilu" "pd7_avgemp_nonzero" "pd7_avgpay_nonzero" "t4_payroll" "pd7_totalpayroll" ///
		"total_revenue" "sales_goods_and_services" ///
		"total_expenses"  "total_cost_of_sales" "total_operating_expenses" "purchases_cost_materials" ///
		{
			rename `var' `var'`y'
			label var `var'`y' "`var' `y' (Keep me)"
			global keep_list "$keep_list `var'"
		}

		******************************************************************
		* convert all nominal amounts into constant 2012 dollars         *
		* CPI is from Bank of Canada's core index: CANSIM Table 326-0021 *
		******************************************************************
		foreach var in ///
		"pd7_avgpay_nonzero" "t4_payroll" "pd7_totalpayroll" ///
		"total_revenue" "sales_goods_and_services" ///
		"total_expenses"  "total_cost_of_sales" "total_operating_expenses" "purchases_cost_materials" ///
		{
			replace `var'`y' = `var'`y'*1.223132037 if tax_yr == 2001
			replace `var'`y' = `var'`y'*1.195 if tax_yr == 2002
			replace `var'`y' = `var'`y'*1.16927593 if tax_yr == 2003
			replace `var'`y' = `var'`y'*1.151252408 if tax_yr == 2004
			replace `var'`y' = `var'`y'*1.132701422 if tax_yr == 2005
			replace `var'`y' = `var'`y'*1.111627907 if tax_yr == 2006
			replace `var'`y' = `var'`y'*1.088342441 if tax_yr == 2007
			replace `var'`y' = `var'`y'*1.069829902 if tax_yr == 2008
			replace `var'`y' = `var'`y'*1.05193662 if tax_yr == 2009
			replace `var'`y' = `var'`y'*1.033737024 if tax_yr == 2010
			replace `var'`y' = `var'`y'*1.017021277 if tax_yr == 2011
			replace `var'`y' = `var'`y'*1 if tax_yr == 2012
		}
		
		ds, has(varl "*(Keep me)*")
		global keep_list1 "`r(varlist)'"
		
		* keep key variables *
		keep eid ///
		$keep_list1
		
		* order key variables *
		order eid ///
		$keep_list1

		* save intermediate dataset to the data_folder *
		compress
		save "`datadir'gc_Enterprise_`y'_`ext'_clean.dta", replace
		clear
	}
	disp "***** STEP 1: clean firm cross-section (COMPLETED) *****"
}
disp "***** Finished processing STEP 1 *****"
endtime

********************************************
* STEP 2: create longitudinal firm dataset *
********************************************
disp "***** Started processing STEP 2 *****"
starttime
if 1 == `gate2' ///
{
	disp "***** STEP 2: create longitudinal firm dataset *****"
	
	* Specify which variables we want to keep again (only useful if running step 2 separately than step 1) *
	global keep_list ""
	foreach var in ///
	"alive" "naics4" "incorporationdate" "birthdate" ///
	"t4_ilu" "pd7_avgemp_nonzero" "pd7_avgpay_nonzero" "t4_payroll" "pd7_totalpayroll" ///
	"total_revenue" "sales_goods_and_services" ///
	"total_expenses"  "total_cost_of_sales" "total_operating_expenses" "purchases_cost_materials" ///
	{
		global keep_list "$keep_list `var'"
	}
	
	********************************************************
	* Use eid longitudinal mapping to create panel dataset *
	********************************************************
	use "`datadir'gc_Enterprise_2001_`ext'_clean.dta", clear
	
	rename eid predecessor_id
	merge 1:1 predecessor_id using "`datadir'Connexions_2001_2002.dta"
	drop if _merge == 2

	* replace firm ID to be equal to successor_id if the firm has a new ID next period *
	gen eid2001 = predecessor_id
	gen eid = predecessor_id
	replace eid = successor_id if _merge == 3

	/* If there is some kind of merger between periods and that one of the merging
	firm keeps its ID, give back to the firms that were acquired their original ID. */
	bys eid: gen counter_eid = _N
	bys predecessor_id: gen counter_pred = _N
	replace eid = predecessor_id if _merge == 3 & predecessor_id != successor_id & counter_eid > counter_pred
	drop _merge predecessor_id successor_id link_type
	
	local a = `start_year' + 1
	local b = `end_year' - 1
	forvalues i = `a'(1)`b' ///
	{
		local j = `i'+1
		merge 1:1 eid using "`datadir'gc_Enterprise_`i'_`ext'_clean.dta"
		drop _merge
		
		rename eid predecessor_id
		merge 1:1 predecessor_id using "`datadir'Connexions_`i'_`j'.dta"
		drop if _merge == 2
		
		* replace firm ID to be equal to successor_id if the firm has a new ID next period *
		gen eid`i' = predecessor_id
		gen eid = predecessor_id
		replace eid = successor_id if _merge == 3
		/* If there is some kind of merger between periods and that one of the merging
		firm keeps its ID, give back to the firms that were acquired their original ID. */
		drop counter*
		bys eid: gen counter_eid = _N
		bys predecessor_id: gen counter_pred = _N
		replace eid = predecessor_id if _merge == 3 & predecessor_id != successor_id & counter_eid > counter_pred
		drop _merge predecessor_id successor_id link_type
	}
	drop counter*
	merge 1:1 eid using "`datadir'gc_Enterprise_`end_year'_`ext'_clean.dta"
	drop _merge
	gen eid`end_year' = eid
	rename eid eid_long
	reshape long eid $keep_list, i(eid_long) j(tax_yr)

	drop if eid == ""
	keep if alive == 1
	drop alive
	sort eid_long tax_yr
	order eid_long tax_yr eid

	********************************************************************
	* fill in the blanks for some firms using responses in other years *
	********************************************************************
	gen inc_year_tmp = year(incorporationdate)
	by eid_long: egen incorporated = min(inc_year_tmp)
	drop inc_year_tmp incorporationdate
	
	gen birth_year_tmp = year(birthdate)
	by eid_long: egen birth = min(birth_year_tmp)
	drop birth_year_tmp birthdate
	
	rename naics4 naics
	destring naics, generate(naics4_tmp)
	by eid_long: egen naics4_int = max(naics4_tmp)
	drop naics naics4_tmp
	tostring naics4_int, gen(naics4)
	
	* save intermediate dataset to the data_folder *
	compress
	save "`datadir'gc_firm_panel_`ext'.dta", replace
	keep eid_long tax_yr eid
	compress
	save "`datadir'gc_longitudinal_eid_`ext'.dta", replace
	clear
	
	disp "***** STEP 2: create longitudinal firm dataset (COMPLETED) *****"
}
disp "***** Finished processing STEP 2 *****"
endtime


*****************************
* STEP 3: identify startups *
*****************************
disp "***** Started processing STEP 3 *****"
starttime
if 1 == `gate3' ///
{
	disp "***** STEP 3: identify startups *****"
	
	use "`datadir'gc_firm_panel_`ext'.dta", clear
	
	* employment *
	gen employment = pd7_avgemp_nonzero
	replace employment = . if employment < 1
	replace employment = t4_ilu if missing(employment)
	replace employment = . if employment < 1
	
	* revenue *
	gen revenue = sales_goods_and_services
	replace revenue = . if revenue <= 1
	replace revenue = total_revenue if missing(revenue)
	replace revenue = . if revenue <= 1
	
	* payroll *
	gen payroll = pd7_totalpayroll
	replace payroll = . if payroll <= 1
	replace payroll = t4_payroll if missing(payroll)
	replace payroll = . if payroll <= 1
	
	* define entry year as first year with positive employment, revenue, or payroll *
	gen tmp1 = .
	foreach var in "employment" "revenue" "payroll" ///
	{
			replace tmp1 = tax_yr if `var' !=. & `var' >0
	}
	sort eid_long tax_yr
	by eid_long: egen entry_year = min(tmp1)
	drop tmp1
	
	* define exit year as last year with positive employment, revenue, or payroll *
	gen tmp1 = .
	foreach var in "employment" "revenue" "payroll" ///
	{
			replace tmp1 = tax_yr if `var' !=. & `var' > 0
	}
	sort eid_long tax_yr
	by eid_long: egen exit_year = max(tmp1)
	drop tmp1

	keep if tax_yr >= entry_year & tax_yr <= exit_year
	
	sort eid_long tax_yr
	by eid_long: gen counter = _n
	
	display "Display number of startups"
	count if counter == 1
	tab entry_year exit_year if counter == 1
	
	* keep only firms with entry year after 2001 *
	keep if entry_year > 2001
	display "Display number of startups"
	count if counter == 1
	tab entry_year exit_year if counter == 1
	
	* keep only firms with non-missing incorporation year *
	keep if incorporated !=.
	display "Display number of startups"
	count if counter == 1
	tab entry_year exit_year if counter == 1
	
	* keep only firms with entry_year within two years of incorporation year *
	keep if incorporated >= entry_year - 2 & incorporated <= entry_year + 2
	display "Display number of startups"
	count if counter == 1
	tab entry_year exit_year if counter == 1

	* drop startups with very large number of employees in first year. *
	gen tmp1 = .
	replace tmp1 = employment if entry_year == tax_yr
	sort eid_long tax_yr
	by eid_long: egen entry_emp = max(tmp1)
	drop tmp1
	sum entry_emp if tax_yr == entry_year, detail
	drop if entry_emp > `r(p95)' & entry_emp != .
	display "Display number of startups"
	count if counter == 1
	tab entry_year exit_year if counter == 1
	drop counter

	* save intermediate dataset to the data_folder *
	compress
	save "`datadir'gc_startup_panel_`ext'.dta", replace

	disp "***** STEP 3: startups only (COMPLETED) *****"
}
disp "***** Finished processing STEP 3 *****"
endtime


********************************************
* STEP 4: delete all intermediate datasets *
********************************************
disp "***** Started processing STEP 4 *****"
starttime
if 1 == `gate4' ///
{
	disp "***** STEP 4: delete all intermediate datasets *****"
	
	forvalues y = `start_year'(1)`end_year' ///
	{
		capture erase "`datadir'gc_Enterprise_`y'_`ext'_clean.dta"
	}
	
	disp "***** STEP 4: delete all intermediate datasets (COMPLETED) *****"
}
disp "***** Finished processing STEP 4 *****"
endtime

*********************
* CLEAN UP AND EXIT *
*********************
clear all
log close
